Build a classifier that, using data available after the end of the student's first academic term (semester or year), predicts whether the student will drop out or not.
In [1]:
%matplotlib inline
#import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm as cm
from pandas.tools.plotting import scatter_matrix
from pandas import DataFrame
from sklearn import cross_validation
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA
In [2]:
df = pd.read_csv('../../../input/alunos.csv', header=0, sep=';')
df.dtypes
Out[2]:
In [3]:
df.head(10)
Out[3]:
In [4]:
# number of nulls in each column
df.isnull().sum()
Out[4]:
In [5]:
df.fillna(value=0, inplace=True)
df.isnull().sum()
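Replacing every missing value with 0 is a strong assumption, since 0 can be a meaningful value for some columns. A minimal sketch of an alternative worth comparing, filling numeric columns with their median on a fresh copy of the CSV; `df_raw` and `df_median` are hypothetical names introduced only for this sketch and are not used by the rest of the notebook:
# Sketch only: reload the raw data and fill numeric columns with their median instead of 0.
df_raw = pd.read_csv('../../../input/alunos.csv', header=0, sep=';')
df_median = df_raw.fillna(df_raw.median())
print(df_median.isnull().sum().sum())  # NaNs remaining in non-numeric columns, if any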
Out[5]:
In [6]:
df.groupby('evadiu').size()
Out[6]:
In [7]:
df.groupby('evadiu').size().plot(kind='bar')
plt.show()
In [8]:
def pizza_evazao(qtd_evadiu, qtd_nao_evadiu):
    labels = 'Evadiu', 'Nao Evadiu'
    sizes = [qtd_evadiu, qtd_nao_evadiu]
    colors = ['gold', 'lightskyblue']
    explode = (0.1, 0)  # explode 1st slice
    # Plot
    plt.pie(sizes, explode=explode, labels=labels, colors=colors,
            autopct='%1.1f%%', shadow=True, startangle=140)
    plt.show()
qtd_evadiu = df.groupby('evadiu').size()[1]
qtd_nao_evadiu = df.groupby('evadiu').size()[0]
pizza_evazao(qtd_evadiu, qtd_nao_evadiu)
In [27]:
df.hist(figsize=(30, 20), bins=20)
plt.show()
In [10]:
df.describe()
Out[10]:
In [11]:
df['distancia_conclusao_2grau'].hist(bins=50)
plt.show()
In [12]:
df_temp = df[df['distancia_conclusao_2grau'] > 10]
df_temp['distancia_conclusao_2grau'].hist(bins=45)
plt.show()
In [13]:
df.groupby('evadiu').describe()
Out[13]:
In [14]:
df.drop('hash_cod_matricula', axis=1, inplace=True)
df.head(10)
Out[14]:
In [15]:
ncolumns = df.shape[1]
array = df.values
X = array[:,0:ncolumns-1].astype(float)
Y = array[:,ncolumns-1]
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(X, Y, test_size=0.20, random_state=7)
In [16]:
print 'X_train/Y_train:'
print X_train.shape
print Y_train.shape
print
print 'X_validation/Y_validation:'
print X_validation.shape
print Y_validation.shape
In [17]:
def trainDummyClassifier(X_train, Y_train):
    print '\nTraining ...'
    # "stratified": generates predictions by respecting the training set's class distribution.
    # "uniform": generates predictions uniformly at random.
    model = DummyClassifier(strategy='stratified', random_state=7)
    # train
    model.fit(X_train, Y_train)
    return model
trainedDummyModel = trainDummyClassifier(X_train, Y_train)
print 'Done!'
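The DummyClassifier above uses the 'stratified' strategy. As a quick sanity check, it can help to compare the available baseline strategies on the same train/validation split; a minimal sketch (the loop and the `baseline*` names are illustrative, not part of the original pipeline):
# Sketch: compare baseline strategies on the same split used above.
for strategy in ['most_frequent', 'stratified', 'uniform']:
    baseline = DummyClassifier(strategy=strategy, random_state=7)
    baseline.fit(X_train, Y_train)
    baseline_predictions = baseline.predict(X_validation)
    print('%s: %.4f' % (strategy, accuracy_score(Y_validation, baseline_predictions)))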
In [18]:
predictions = trainedDummyModel.predict(X_validation)
print 'X_validation:', len(X_validation)
print 'predictions:', len(predictions)
print predictions
In [19]:
qtd_evadiu = len(np.where(predictions == 1)[0])
qtd_nao_evadiu = len(np.where(predictions == 0)[0])
pizza_evazao(qtd_evadiu, qtd_nao_evadiu)
In [20]:
def test_accuracy(predictions, X_validation, Y_validation):
    print '\n=== Model Accuracy ==='
    print '\naccuracy_score:'
    print(accuracy_score(Y_validation, predictions))
    print '\nconfusion_matrix:'
    print '=> By definition a confusion matrix C is such that C_{i, j} is equal to the number of observations known to be in group i but predicted to be in group j.'
    print(confusion_matrix(Y_validation, predictions))
    print '\nclassification_report:'
    print '=> http://machinelearningmastery.com/classification-accuracy-is-not-enough-more-performance-measures-you-can-use/'
    print(classification_report(Y_validation, predictions))
test_accuracy(predictions, X_validation, Y_validation)
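For the binary labels here (0 = did not drop out, 1 = dropped out), the confusion matrix printed above can be unpacked into its four cells. A small sketch, assuming the labels are exactly 0 and 1; `tn`, `fp`, `fn`, `tp` are names introduced only for illustration:
# Sketch: unpack the 2x2 confusion matrix; sklearn orders rows/columns by sorted label (0, 1).
tn, fp, fn, tp = confusion_matrix(Y_validation, predictions).ravel()
print('true negatives (stayed, predicted stayed):   %d' % tn)
print('false positives (stayed, predicted dropout): %d' % fp)
print('false negatives (dropout, predicted stayed): %d' % fn)
print('true positives (dropout, predicted dropout): %d' % tp)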
But what is a Random Forest? A quick pause for a brief overview of Random Forest:
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
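In short, a Random Forest is an ensemble of decision trees, each trained on a bootstrap sample of the data with random subsets of the features considered at each split; the forest predicts by aggregating the predictions of its trees. A tiny illustrative sketch on synthetic data (the `toy_*` variables below are made up for this example and unrelated to the student dataset):
# Sketch: a Random Forest is a collection of decision trees whose predictions are aggregated.
toy_X = np.random.RandomState(7).rand(100, 4)
toy_y = (toy_X[:, 0] + toy_X[:, 1] > 1).astype(int)
toy_forest = RandomForestClassifier(n_estimators=10, random_state=7).fit(toy_X, toy_y)
print(len(toy_forest.estimators_))    # 10 individual DecisionTreeClassifier objects
print(toy_forest.predict(toy_X[:5]))  # aggregated prediction of the 10 trees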
In [21]:
def trainRandomForestClassifier(X_train, Y_train):
    print '\nTraining ...'
    model = RandomForestClassifier(max_features='log2', n_estimators=500, random_state=7, class_weight='balanced')
    # train
    model.fit(X_train, Y_train)
    return model
trainedRFModel = trainRandomForestClassifier(X_train, Y_train)
print 'Done!'
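One advantage of Random Forest is that the trained model reports how much each feature contributed to its splits via `feature_importances_`. A short sketch, assuming (as in the matrix built earlier) that the feature columns are all columns of df except the last one, 'evadiu'; `feature_names` is a name introduced here:
# Sketch: rank the features by the importance the trained forest assigned to them.
feature_names = df.columns[:ncolumns-1]
importances = trainedRFModel.feature_importances_
for name, importance in sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True):
    print('%-40s %.4f' % (name, importance))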
In [22]:
# predictions from the trained Random Forest on the validation set
predictions = trainedRFModel.predict(X_validation)
qtd_evadiu = len(np.where(predictions == 1)[0])
qtd_nao_evadiu = len(np.where(predictions == 0)[0])
pizza_evazao(qtd_evadiu, qtd_nao_evadiu)
In [23]:
test_accuracy(predictions, X_validation, Y_validation)
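Beyond hard 0/1 predictions, the forest can also output a dropout probability per student, which is useful for ranking students by risk. A minimal sketch using `predict_proba` and the ROC AUC; the `roc_auc_score` import and the `dropout_probability` name are added here for illustration:
# Sketch: probability of the positive class (evadiu == 1) for each student in the validation set,
# plus the ROC AUC, which does not depend on the 0.5 decision threshold.
from sklearn.metrics import roc_auc_score
dropout_probability = trainedRFModel.predict_proba(X_validation)[:, 1]
print(dropout_probability[:10])
print('ROC AUC: %.4f' % roc_auc_score(Y_validation, dropout_probability))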
In [24]:
# http://scikit-learn.org/stable/auto_examples/plot_multilabel.html#sphx-glr-auto-examples-plot-multilabel-py
def plot_hyperplane(clf, min_x, max_x, linestyle, label):
# get the separating hyperplane
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(min_x - 5, max_x + 5) # make sure the line is long enough
yy = a * xx - (clf.intercept_[0]) / w[1]
plt.plot(xx, yy, linestyle, label=label)
def plot_subfigure(X, Y, subplot, title, transform):
    if transform == "pca":
        X = PCA(n_components=2).fit_transform(X)
    elif transform == "cca":
        X = CCA(n_components=2).fit(X, Y).transform(X)
    else:
        raise ValueError

    min_x = np.min(X[:, 0])
    max_x = np.max(X[:, 0])

    min_y = np.min(X[:, 1])
    max_y = np.max(X[:, 1])

    classif = OneVsRestClassifier(SVC(kernel='linear'))
    classif.fit(X, Y)

    plt.subplot(2, 2, subplot)
    plt.title(title)

    zero_class = np.where(Y[:, 0])
    one_class = np.where(Y[:, 1])
    plt.scatter(X[:, 0], X[:, 1], s=40, c='gray')
    plt.scatter(X[zero_class, 0], X[zero_class, 1], s=160, edgecolors='b',
                facecolors='none', linewidths=2, label='Class 1')
    plt.scatter(X[one_class, 0], X[one_class, 1], s=80, edgecolors='orange',
                facecolors='none', linewidths=2, label='Class 2')

    plot_hyperplane(classif.estimators_[0], min_x, max_x, 'k--',
                    'Boundary\nfor class 1')
    plot_hyperplane(classif.estimators_[1], min_x, max_x, 'k-.',
                    'Boundary\nfor class 2')
    plt.xticks(())
    plt.yticks(())

    plt.xlim(min_x - .5 * max_x, max_x + .5 * max_x)
    plt.ylim(min_y - .5 * max_y, max_y + .5 * max_y)
    if subplot == 2:
        plt.xlabel('First principal component')
        plt.ylabel('Second principal component')
    plt.legend(loc="upper left")
In [25]:
x = X_validation
y0 = Y_validation
y1 = 1 - Y_validation
y = np.column_stack((y0.reshape(len(y0), 1), y1.reshape(len(y1), 1)))
plt.figure(figsize=(40, 30))
data_samples = 200
plot_subfigure(x[:data_samples], y[:data_samples], 4, "PCA - 2D", "pca")
plt.show()
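The 2D picture above is only as faithful as the two principal components it uses. A quick sketch to check how much of the plotted samples' variance those two components actually capture; `pca_2d` is an illustrative name:
# Sketch: fraction of the variance captured by the first two principal components.
pca_2d = PCA(n_components=2).fit(x[:data_samples])
print(pca_2d.explained_variance_ratio_)
print('total: %.4f' % pca_2d.explained_variance_ratio_.sum())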
In [26]:
n_estimators_values = [50, 200]
max_features_values = [1, 0.1, 'log2']
param_grid = dict(n_estimators=n_estimators_values, max_features=max_features_values)
results = []
model = RandomForestClassifier(verbose=0)
# cross-validation is a very important mechanism for keeping the model generic enough to
# produce good predictions on unseen datasets.
kfold = cross_validation.KFold(n=len(X_train), n_folds=10, random_state=7)
grid = GridSearchCV(n_jobs=5, estimator=model, param_grid=param_grid, scoring='accuracy', cv=kfold)
grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_['mean_test_score']
results.append(cv_results)
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean_score, std_score, param in sorted(zip(means, stds, params), key=lambda x: x[0], reverse=True):
    print("%f (%f) with: %r" % (mean_score, std_score, param))
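Since `refit=True` is the GridSearchCV default, `grid_result.best_estimator_` is already a Random Forest retrained on the full training set with the best parameters found. A short sketch evaluating it on the held-out validation data with the same `test_accuracy` helper; `best_model` and `best_predictions` are names introduced here:
# Sketch: evaluate the best model found by the grid search on the hold-out validation set.
best_model = grid_result.best_estimator_
best_predictions = best_model.predict(X_validation)
test_accuracy(best_predictions, X_validation, Y_validation)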